/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#ifndef RR_PRELOAD_INTERFACE_H_
#define RR_PRELOAD_INTERFACE_H_

/* Bump this whenever the interface between syscallbuf and rr changes in a way
 * that would require changes to replay. So be very careful making changes to
 * this file! Many changes would require a bump in this value, and support
 * code in rr to handle old protocol versions. And when we bump it we'll need
 * to figure out a way to test the old protocol versions.
 * To be clear, changes that only affect recording and not replay, such as
 * changes to the layout of syscall_patch_hook, do not need to bump this.
 * Note also that SYSCALLBUF_PROTOCOL_VERSION is stored in the trace header, so
 * replay always has access to the SYSCALLBUF_PROTOCOL_VERSION used during
 * recording, even before the preload library is ever loaded.
 *
 * Version 0: initial rr 5.0.0 release
 */
#define SYSCALLBUF_PROTOCOL_VERSION 0

#if defined(RR_IMPLEMENT_PRELOAD) || defined(RR_IMPLEMENT_AUDIT)
/* Avoid using <string.h> library functions */
static inline int streq(const char* s1, const char* s2) {
  while (1) {
    if (*s1 != *s2) {
      return 0;
    }
    if (!*s1) {
      return 1;
    }
    ++s1;
    ++s2;
  }
  return 1;
}
static inline size_t rrstrlen(const char* s) {
  size_t ret = 0;
  while (*s) {
    ++s;
    ++ret;
  }
  return ret;
}
#else
#include <string.h>
static inline int streq(const char* s1, const char* s2) {
  return !strcmp(s1, s2);
}
static inline size_t rrstrlen(const char* s) { return strlen(s); }
#include "../remote_ptr.h"
#endif

#include <stdint.h>
#include <stddef.h>

static inline int strprefix(const char* s1, const char* s2) {
  while (1) {
    if (!*s1) {
      return 1;
    }
    if (*s1 != *s2) {
      return 0;
    }
    ++s1;
    ++s2;
  }
  return 1;
}

static inline const char* extract_file_name(const char* s) {
  const char* ret = s;
  while (*s) {
    if (*s == '/') {
      ret = s + 1;
    }
    ++s;
  }
  return ret;
}

/* This header file is included by preload.c and various rr .cc files. It
 * defines the interface between the preload library and rr. preload.c
 * #defines RR_IMPLEMENT_PRELOAD to let us handle situations where rr and
 * preload.c need to see slightly different definitions of the same constructs.
 *
 * preload.c compiles this as C code. All rr modules compile this as C++ code.
 * We do not use 'extern "C"' because we don't actually link between C and C++
 * and 'extern "C"' is not compatible with our use of templates below.
 */

#define SYSCALLBUF_LIB_FILENAME_BASE "librrpreload"
#define SYSCALLBUF_LIB_FILENAME SYSCALLBUF_LIB_FILENAME_BASE ".so"
#define SYSCALLBUF_LIB_FILENAME_PADDED SYSCALLBUF_LIB_FILENAME_BASE ".so:::"
#define SYSCALLBUF_LIB_FILENAME_32 SYSCALLBUF_LIB_FILENAME_BASE "_32.so"

#define RTLDAUDIT_LIB_FILENAME_BASE "librraudit"
#define RTLDAUDIT_LIB_FILENAME RTLDAUDIT_LIB_FILENAME_BASE ".so"
#define RTLDAUDIT_LIB_FILENAME_PADDED RTLDAUDIT_LIB_FILENAME_BASE ".so:::"
#define RTLDAUDIT_LIB_FILENAME_32 RTLDAUDIT_LIB_FILENAME_BASE "_32.so"

#define RRPAGE_LIB_FILENAME_BASE "librrpage"
#define RRPAGE_LIB_FILENAME RRPAGE_LIB_FILENAME_BASE ".so"
#define RRPAGE_LIB_FILENAME_32 RRPAGE_LIB_FILENAME_BASE "_32.so"

/* Set this env var to enable syscall buffering. */
#define SYSCALLBUF_ENABLED_ENV_VAR "_RR_USE_SYSCALLBUF"

/* Size of table mapping fd numbers to syscallbuf-disabled flag. */
#define SYSCALLBUF_FDS_DISABLED_SIZE 16384

#define MPROTECT_RECORD_COUNT 1000

#if defined(__x86_64__) || defined(__i386__)
#define RR_PAGE_SYSCALL_STUB_SIZE 3
#define RR_PAGE_SYSCALL_INSTRUCTION_END 2
#elif defined(__aarch64__)
#define RR_PAGE_SYSCALL_STUB_SIZE 8
#define RR_PAGE_SYSCALL_INSTRUCTION_END 4
#else
#error "Must be defined for this architecture"
#endif

/* Must match generate_rr_page.py */
#define RR_PAGE_ADDR 0x70000000
#ifdef __aarch64__
#define PRELOAD_LIBRARY_PAGE_SIZE 65536
#else
#define PRELOAD_LIBRARY_PAGE_SIZE 4096
#endif
#define RR_PAGE_SYSCALL_ADDR(index)                                            \
  ((void*)(RR_PAGE_ADDR + RR_PAGE_SYSCALL_STUB_SIZE * (index)))
#define RR_PAGE_SYSCALL_TRACED RR_PAGE_SYSCALL_ADDR(0)
#define RR_PAGE_SYSCALL_PRIVILEGED_TRACED RR_PAGE_SYSCALL_ADDR(1)
#define RR_PAGE_SYSCALL_UNTRACED RR_PAGE_SYSCALL_ADDR(2)
#define RR_PAGE_SYSCALL_UNTRACED_REPLAY_ONLY RR_PAGE_SYSCALL_ADDR(3)
#define RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY RR_PAGE_SYSCALL_ADDR(4)
#define RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED RR_PAGE_SYSCALL_ADDR(5)
#define RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_REPLAY_ONLY RR_PAGE_SYSCALL_ADDR(6)
#define RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_RECORDING_ONLY                     \
  RR_PAGE_SYSCALL_ADDR(7)
#define RR_PAGE_SYSCALL_UNTRACED_REPLAY_ASSIST RR_PAGE_SYSCALL_ADDR(8)
#define RR_PAGE_IN_REPLAY_FLAG (RR_PAGE_ADDR + RR_PAGE_SYSCALL_STUB_SIZE * 9)
#define RR_PAGE_BREAKPOINT_VALUE (RR_PAGE_IN_REPLAY_FLAG + 4)

/* Not ABI stable - in record page only */
#define RR_PAGE_FF_BYTES RR_PAGE_BREAKPOINT_VALUE

#define RR_DL_RUNTIME_RESOLVE_CLEAR_FIP (RR_PAGE_ADDR - PRELOAD_LIBRARY_PAGE_SIZE)

/* PRELOAD_THREAD_LOCALS_ADDR should not change.
 * Tools depend on this address. */
#define PRELOAD_THREAD_LOCALS_ADDR (RR_PAGE_ADDR + PRELOAD_LIBRARY_PAGE_SIZE)
#ifdef __aarch64__
#define PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE (1024 + 8 * 2)
#else
#define PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE 0
#endif
#define PRELOAD_THREAD_LOCALS_SIZE (144 + PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE)

#include "rrcalls.h"

/* Define macros that let us compile a struct definition either "natively"
 * (when included by preload.c) or as a template over Arch for use by rr.
 */
#if defined(RR_IMPLEMENT_PRELOAD) || defined(RR_IMPLEMENT_AUDIT)
#define TEMPLATE_ARCH
#define PTR(T) T*
#define PTR_ARCH(T) T*
#define EMBED_STRUCT(T) struct T
#define VOLATILE volatile
#define SIGNED_LONG long
#define UNSIGNED_LONG unsigned long
#else
#define TEMPLATE_ARCH template <typename Arch>
#define PTR(T) typename Arch::template ptr<T>
#define PTR_ARCH(T) typename Arch::template ptr<T<Arch>>
#define EMBED_STRUCT(T) T<Arch>
#define VOLATILE
#define SIGNED_LONG typename Arch::signed_long
#define UNSIGNED_LONG typename Arch::unsigned_long
#endif

#define PATCH_IS_MULTIPLE_INSTRUCTIONS (1 << 0)
/* The syscall instruction is the last instruction in the patched area
 * (rather than the first), which requires special handling.
 */
#define PATCH_SYSCALL_INSTRUCTION_IS_LAST (1 << 1)
/* All instructions in the patch are nop and their execution is thus not
 * observable. This may allow more aggressive handling of interfering branches.
 */
#define PATCH_IS_NOP_INSTRUCTIONS (1 << 2)


/**
 * To support syscall buffering, we replace syscall instructions with a "call"
 * instruction that calls a hook in the preload library to handle the syscall.
 * Since the call instruction takes more space than the syscall instruction,
 * the patch replaces one or more instructions after the syscall instruction as
 * well; those instructions are folded into the tail of the hook function
 * and we have multiple hook functions, each one corresponding to an
 * instruction that follows a syscall instruction.
 * Each instance of this struct describes an instruction that can follow a
 * syscall and a hook function to patch with.
 *
 * This is not (and must not ever be) used during replay so we can change it
 * without bumping SYSCALLBUF_PROTOCOL_VERSION.
 */
struct syscall_patch_hook {
  uint8_t flags;
  uint8_t patch_region_length;
  /* Avoid any padding or anything that would make the layout arch-specific. */
  uint8_t patch_region_bytes[14];
  uint64_t hook_address;
};

/**
 * We buffer mprotect syscalls. Their effects need to be noted so we can
 * update AddressSpace's cache of memory layout, which stores prot bits. So,
 * the preload code builds a list of mprotect_records corresponding to the
 * mprotect syscalls that have been buffered. This list is read by rr whenever
 * we flush the syscallbuf, and its effects performed. The actual mprotect
 * syscalls are performed during recording and replay.
 *
 * We simplify things by making this arch-independent.
 */
struct mprotect_record {
  uint64_t start;
  uint64_t size;
  int32_t prot;
  int32_t padding;
};

/**
 * Must be arch-independent.
 * Variables used to communicate between preload and rr.
 * We package these up into a single struct to simplify the preload/rr
 * interface.
 * You can add to the end of this struct without breaking trace compatibility,
 * but don't move existing fields. Do not write to it during replay except for
 * the 'in_replay' field. Be careful reading fields during replay as noted
 * below, since they don't all exist in all trace versions.
 */
struct preload_globals {
  /* RESERVED in current versions of rr.
   *
   * QUIRK: With UsesGlobalsInReplayQuirk:
   * 0 during recording, 1 during replay. Set by rr.
   * This MUST NOT be used in conditional branches. It should only be used
   * as the condition for conditional moves so that control flow during replay
   * does not diverge from control flow during recording.
   * We also have to be careful that values different between record and replay
   * don't accidentally leak into other memory locations or registers.
   * USE WITH CAUTION.
   */
  unsigned char reserved_legacy_in_replay;
  /* 0 during recording and replay, 1 during diversion. Set by rr.
   */
  unsigned char in_diversion;
  /* 1 if chaos mode is enabled. DO NOT READ from rr during replay, because
     this field is not initialized in old traces. */
  unsigned char in_chaos;
  /* The signal to use for desched events */
  unsigned char desched_sig;
  /* RESERVED */
  int reserved;
  /**
   * Set by rr.
   * For each fd, indicate a class that is valid for all fds with the given
   * number in all tasks that share this address space. For fds >=
   * SYSCALLBUF_FDS_DISABLED_SIZE - 1, the class is given by by
   * syscallbuf_fd_class[SYSCALLBUF_FDS_DISABLED_SIZE - 1]. See the
   */
  VOLATILE char syscallbuf_fd_class[SYSCALLBUF_FDS_DISABLED_SIZE];

  /* WARNING! SYSCALLBUF_FDS_DISABLED_SIZE can change, so
     access to the following fields during replay is dangerous. Use
     PRELOAD_GLOBALS_FIELD_AFTER_SYSCALLBUF_FDS_DISABLED or something
     like it! */
  /* mprotect records. Set by preload. Use
     PRELOAD_GLOBALS_FIELD_AFTER_SYSCALLBUF_FDS_DISABLED to access. */
  struct mprotect_record mprotect_records[MPROTECT_RECORD_COUNT];
  /* Random seed that can be used for various purposes. DO NOT READ from rr
     during replay, because this field does not exist in old traces. */
  uint64_t random_seed;
  /* RESERVED in current versions of rr.
   *
   * QUIRK: With UsesGlobalsInReplayQuirk:
   * Indicates the value (in 8-byte increments) at which to raise a SIGSEGV
   * trap once reached. NOTE: This remains constant during record, and is
   * used only during replay. The same restrictions as in_replay above apply.
   *
   * Use PRELOAD_GLOBALS_FIELD_AFTER_SYSCALLBUF_FDS_DISABLED to access during
   * replay. */
  uint64_t reserved_legacy_breakpoint_value;
  /* Indicates whether or not all tasks in this address space have the same
     fd table. Set by rr during record (modifications are recorded).
     Read by the syscallbuf. Not read during replay. */
  unsigned char fdt_uniform;
  /* The CPU we're bound to, if any; -1 if not bound. Not read during replay. */
  int32_t cpu_binding;
};

/**
 * Represents syscall params.  Makes it simpler to pass them around,
 * and avoids pushing/popping all the data for calls.
 */
TEMPLATE_ARCH
struct syscall_info {
  SIGNED_LONG no;
  SIGNED_LONG args[6];
};

TEMPLATE_ARCH
struct robust_list_info {
  PTR(void) head;
  uint32_t len;
};

TEMPLATE_ARCH
struct rseq_info {
  PTR(void) rseq;
  uint32_t len;
  uint32_t sig;
};

/**
 * Can be architecture dependent. The rr process does not manipulate
 * these except to save and restore the values on task switches so that
 * the values are always effectively local to the current task. rr also
 * sets the |syscallbuf_stub_alt_stack| field.
 * We use this instead of regular libc TLS because sometimes buggy application
 * code breaks libc TLS for some tasks. With this approach we can be sure
 * thread-locals are usable for any task in any state.
 */
TEMPLATE_ARCH
struct preload_thread_locals {
  /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
   * rr depends on.
   * Offset of this field is hardcoded in syscall_hook.S and
   * assembly_templates.py.
   * Pointer to alt-stack used by syscallbuf stubs (allocated at the end of
   * the scratch buffer.
   */
  PTR(void) syscallbuf_stub_alt_stack;
  /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
   * tools can depend on.
   * Where syscall result will be (or during replay, has been) saved.
   */
  PTR(int64_t) pending_untraced_syscall_result;
  /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
   * rr depends on.
   * Scratch space used by stub code.
   */
  PTR(void) stub_scratch_1;
  /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
   * rr depends on.
   */
  int32_t alt_stack_nesting_level;
  /* Syscall hook saved flags (bottom 16 bits only) */
  int32_t saved_flags;
  /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
   * rr depends on. It contains the parameters to the patched syscall, or
   * zero if we're not processing a buffered syscall. Do not depend on this
   * existing during replay, some traces with SYSCALLBUF_PROTOCOL_VERSION 0
   * don't have it.
   */
  PTR_ARCH(const struct syscall_info) original_syscall_parameters;

  /* Nonzero when thread-local state like the syscallbuf has been
   * initialized.  */
  int32_t thread_inited;
  /* The offset of this field MUST NOT CHANGE, it is part of the ABI tools
   * depend on. When buffering is enabled, points at the thread's mapped buffer
   * segment.  At the start of the segment is an object of type |struct
   * syscallbuf_hdr|, so |buffer| is also a pointer to the buffer
   * header. */
  PTR(uint8_t) buffer;
  UNSIGNED_LONG buffer_size;
  /* This is used to support the buffering of "may-block" system calls.
   * The problem that needs to be addressed can be introduced with a
   * simple example; assume that we're buffering the "read" and "write"
   * syscalls.
   *
   *  o (Tasks W and R set up a synchronous-IO pipe open between them; W
   *    "owns" the write end of the pipe; R owns the read end; the pipe
   *    buffer is full)
   *  o Task W invokes the write syscall on the pipe
   *  o Since write is a buffered syscall, the seccomp filter traps W
   *    directly to the kernel; there's no trace event for W delivered
   *    to rr.
   *  o The pipe is full, so W is descheduled by the kernel because W
   *    can't make progress.
   *  o rr thinks W is still running and doesn't schedule R.
   *
   * At this point, progress in the recorded application can only be
   * made by scheduling R, but no one tells rr to do that.  Oops!
   *
   * Thus enter the "desched counter".  It's a perf_event for the "sw t
   * switches" event (which, more precisely, is "sw deschedule"; it
   * counts schedule-out, not schedule-in).  We program the counter to
   * deliver a signal to this task when there's new counter data
   * available.  And we set up the "sample period", how many descheds
   * are triggered before the signal is delivered, to be "1".  This
   * means that when the counter is armed, the next desched (i.e., the
   * next time the desched counter is bumped up) of this task will
   * deliver the signal to it.  And signal delivery always generates a
   * ptrace trap, so rr can deduce that this task was descheduled and
   * schedule another.
   *
   * The description above is sort of an idealized view; there are
   * numerous implementation details that are documented in
   * handle_signal.c, where they're dealt with. */
  int32_t desched_counter_fd;
  int32_t cloned_file_data_fd;
  SIGNED_LONG cloned_file_data_offset;
  PTR(void) scratch_buf;
  UNSIGNED_LONG usable_scratch_size;

  PTR(struct msghdr) notify_control_msg;

  /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
   * rr depends on, on ARM.
   */
  uint8_t stub_scratch_2[PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE];

  /** When the size is non-zero, there has been a buffered set_robust_list
   * that must be accounted for. Set by preload code only, read by rr
   * only during recording.
   */
  EMBED_STRUCT(robust_list_info) robust_list;

  /** True when either a buffered rseq or unbuffered rseq has been called
   * for this thread. Set by rr for buffered rseq and preload for unbuffered
   * rseq. */
  int32_t rseq_called;

  /** When the len is non-zero, there has been a buffered rseq
   * that must be accounted for. Set by preload code only, read by rr
   * only during recording.
   */
  EMBED_STRUCT(rseq_info) rseq;
};
#if defined(__aarch64__) && (defined(RR_IMPLEMENT_PRELOAD) || \
                             defined(RR_IMPLEMENT_AUDIT))
// On aarch64, we the stub_scratch_2 offset is hardcoded in the syscallbuf code
_Static_assert(offsetof(struct preload_thread_locals, stub_scratch_2) == 8 * 13,
               "stub_scratch_2 offset mismatch");
#endif

// The set of flags that can be set for each fd in syscallbuf_fds_disabled.
enum syscallbuf_fd_classes {
  // fd is invalid, all syscalls will error (syscallbuf internal use only)
  FD_CLASS_INVALID = -1,
  // The fd is allowed to be completely untraced. No notification to the
  // syscall buf is required.
  FD_CLASS_UNTRACED = 0x0,
  // This is the most conservative option. All operations on this fd are
  // always traced. If there is a conflict between other options, this one
  // should be chosen.
  FD_CLASS_TRACED   = 0x1,
  // This fd either refers to a /proc/<pid>/mem or is untrace (if this as
  // is shared with another fd table)
  FD_CLASS_PROC_MEM = 0x2,
};

#define CURRENT_INIT_PRELOAD_PARAMS_VERSION 2

/**
 * Packs up the parameters passed to |SYS_rrcall_init_preload|.
 * We use this struct because it's a little cleaner.
 * When evolving this struct, add new fields at the end and don't
 * depend on them during replay.
 */
TEMPLATE_ARCH
struct rrcall_init_preload_params {
  /* All "In" params. */
  /* The syscallbuf lib's idea of whether buffering is enabled.
   * We let the syscallbuf code decide in order to more simply
   * replay the same decision that was recorded. */
  int syscallbuf_enabled;
  int syscall_patch_hook_count;
  PTR(struct syscall_patch_hook) syscall_patch_hooks;
  PTR(void) unused;
  PTR(void) syscallbuf_code_start;
  PTR(void) syscallbuf_code_end;
  PTR(void) get_pc_thunks_start;
  PTR(void) get_pc_thunks_end;
  PTR(void) syscallbuf_final_exit_instruction;
  PTR(struct preload_globals) globals;
  union {
    struct {
    /* Address of the first entry of the breakpoint table.
     * After processing a sycallbuf record (and unlocking the syscallbuf),
     * we call a function in this table corresponding to the record processed.
     * rr can set a breakpoint in this table to break on the completion of a
     * particular syscallbuf record.
     * This method of setting the breakpoint is deprecated. Instead, use the
     * interface below. It is retained for compatibility */
      PTR(void) breakpoint_table;
      int breakpoint_table_entry_size;
    };
    struct {
      PTR(void) breakpoint_instr_addr;
      // Set of -1 to indicate non-legacy mode
      int breakpoint_mode_sentinel;
    };
  };
  PTR(void) syscallbuf_syscall_hook;
};

/**
 * Packs up the inout parameters passed to |SYS_rrcall_init_buffers|.
 * We use this struct because there are too many params to pass
 * through registers on at least x86.  (It's also a little cleaner.)
 */
TEMPLATE_ARCH
struct rrcall_init_buffers_params {
  /* The fd we're using to track desched events. */
  int desched_counter_fd;
  /* "Out" params. */
  int cloned_file_data_fd;
  /* Returned pointer to and size of the shared syscallbuf
   * segment. */
  PTR(void) syscallbuf_ptr;
  /* Returned pointer to rr's syscall scratch buffer */
  PTR(void) scratch_buf;
  uint32_t syscallbuf_size;
  uint32_t usable_scratch_size;
};

/**
 * The syscall buffer comprises an array of these variable-length
 * records, along with the header below.
 */
struct syscallbuf_record {
  /* Return value from the syscall.  This can be a memory
   * address, so must be as big as a memory address can be.
   * We use 64 bits rather than make syscallbuf_record Arch-specific as that
   * gets cumbersome.
   */
  int64_t ret;
  /* Syscall number.
   *
   * NB: the x86 linux ABI has 350 syscalls as of 3.9.6 and
   * x86-64 defines 313, so this is a pretty safe storage
   * allocation.  It would be an earth-shattering event if the
   * syscall surface were doubled in a short period of time, and
   * even then we would have a comfortable cushion.  Still,
   *
   * TODO: static_assert this can hold largest syscall num */
  uint16_t syscallno;
  /* Did the tracee arm/disarm the desched notification for this
   * syscall? */
  uint8_t desched : 1;
  /* Does this record require an assist during replay ? */
  uint8_t replay_assist : 1;
  uint8_t _flags_padding : 6;
  uint8_t _padding;
  /* Size of entire record in bytes: this struct plus extra
   * recorded data stored inline after the last field, not
   * including padding.
   *
   * TODO: static_assert this can repr >= buffer size */
  uint32_t size;
  /* Extra recorded outparam data starts here. */
  uint8_t extra_data[0];
};

/**
 * This struct summarizes the state of the syscall buffer.  It happens
 * to be located at the start of the buffer.
 */
struct syscallbuf_hdr {
  /* The number of valid syscallbuf_record bytes in the buffer,
   * not counting this header.
   * Make this volatile so that memory writes aren't reordered around
   * updates to this field. */
  volatile uint32_t num_rec_bytes;
  /* Number of mprotect calls since last syscallbuf flush. The last record in
   * the list may not have been applied yet.
   */
  volatile uint32_t mprotect_record_count;
  /* Number of records whose syscalls have definitely completed.
   * May be one less than mprotect_record_count.
   */
  volatile uint32_t mprotect_record_count_completed;
  /* True if the current syscall should not be committed to the
   * buffer, for whatever reason; likely interrupted by
   * desched. Set by rr. */
  volatile uint8_t abort_commit;
  /* True if, next time we exit the syscall buffer hook, libpreload should
   * execute SYS_rrcall_notify_syscall_hook_exit to give rr the opportunity to
   * deliver a signal and/or reset the syscallbuf. */
  volatile uint8_t notify_on_syscall_hook_exit;
  /* This tracks whether the buffer is currently in use for a
   * system call or otherwise unavailable. This is helpful when
   * a signal handler runs during a wrapped system call; we don't want
   * it to use the buffer for its system calls. The different reasons why the
   * buffer could be locked, use different bits of this field and the buffer
   * may be used only if all are clear. See enum syscallbuf_locked_why for
   * used bits.
   */
  volatile uint8_t locked;
  /* Nonzero when rr needs to worry about the desched signal.
   * When it's zero, the desched signal can safely be
   * discarded. */
  volatile uint8_t desched_signal_may_be_relevant;
  /* A copy of the tasks's signal mask. Updated by preload when a buffered
   * rt_sigprocmask executes.
   */
  volatile uint64_t blocked_sigs;
  /* Incremented by preload every time a buffered rt_sigprocmask executes.
   * Cleared during syscallbuf reset.
   */
  volatile uint32_t blocked_sigs_generation;
  /* Nonzero when preload is in the process of calling an untraced
   * sigprocmask; the real sigprocmask may or may not match blocked_sigs.
   */
  volatile uint8_t in_sigprocmask_critical_section;
  /* Nonzero when the syscall was aborted during preparation without doing
   * anything. This is set when a user seccomp filter forces a SIGSYS. */
  volatile uint8_t failed_during_preparation;

  uint8_t padding[2];
} __attribute__((__packed__));
#ifdef __cplusplus
static_assert(sizeof(struct syscallbuf_hdr) % 8 == 0,
              "syscallbuf_hdr size must be multiple of 8");
#endif

/**
 * Each bit of of syscallbuf_hdr->locked indicates a reason why the syscallbuf
 * is locked. These are all the bits that are currently defined.
 */
enum syscallbuf_locked_why {
  /* Used by the tracee, during interruptible syscalls to avoid recursion */
  SYSCALLBUF_LOCKED_TRACEE = 0x1,
  /* Used by the tracer to prevent syscall buffering when necessary to preserve
     semantics (e.g. for ptracees whose syscalls are being observed) */
  SYSCALLBUF_LOCKED_TRACER = 0x2
};

/**
 * Return the amount of space that a record of |length| will occupy in
 * the buffer if committed, including padding.
 */
inline static long stored_record_size(size_t length) {
  /* Round up to a whole number of 64-bit words. */
  return (length + 7) & ~7;
}

/**
 * Return nonzero if an attempted open() of |filename| should be
 * blocked.
 *
 * The background of this hack is that rr doesn't support DRI/DRM
 * currently, so we use the blunt stick of refusing to open this
 * interface file as a way of disabling it entirely.  (In addition to
 * tickling xorg.conf, which doesn't entirely do the trick.)  It's
 * known how to fix this particular, so let's not let this hack grow
 * too much by piling on.
 */
inline static int is_blacklisted_filename(const char* filename) {
  const char* f;
  if (strprefix("/dev/dri/", filename) || streq("/dev/nvidiactl", filename) ||
      streq("/usr/share/alsa/alsa.conf", filename) ||
      streq("/dev/nvidia-uvm", filename)) {
    return 1;
  }
  f = extract_file_name(filename);
  return strprefix("rr-test-blacklist-file_name", f) ||
         strprefix("pulse-shm-", f);
}

inline static int is_blacklisted_memfd(const char* name) {
  return streq("pulseaudio", name);
}

inline static int is_blacklisted_socket(const char* filename) {
  /* Blacklist the nscd socket because glibc communicates with the daemon over
   * shared memory rr can't handle.
   */
  return streq("/var/run/nscd/socket", filename);
}

inline static int is_gcrypt_deny_file(const char* filename) {
  return streq("/etc/gcrypt/hwf.deny", filename);
}

inline static int is_terminal(const char* filename) {
  return strprefix("/dev/tty", filename) || strprefix("/dev/pts", filename);
}

inline static int is_proc_mem_file(const char* filename) {
  if (!strprefix("/proc/", filename)) {
    return 0;
  }
  return streq(filename + rrstrlen(filename) - 4, "/mem");
}

inline static int is_proc_fd_dir(const char* filename) {
  if (!strprefix("/proc/", filename)) {
    return 0;
  }

  int len = rrstrlen(filename);
  const char* fd_bit = filename + len;
  if (*fd_bit == '/') {
    fd_bit--;
  }

  return strprefix("/fd", fd_bit - 3);
}

inline static int is_sys_cpu_online_file(const char* filename) {
  return streq("/sys/devices/system/cpu/online", filename);
}

inline static int is_proc_stat_file(const char* filename) {
  return streq("/proc/stat", filename);
}

inline static int is_rr_page_lib(const char* filename) {
  return streq(extract_file_name(filename), RRPAGE_LIB_FILENAME) ||
         streq(extract_file_name(filename), RRPAGE_LIB_FILENAME_32);
}

/**
 * Returns nonzero if an attempted open() of |filename| can be syscall-buffered.
 * When this returns zero, the open must be forwarded to the rr process.
 * |filename| must be absolute.
 * This is imperfect because it doesn't handle hard links and files (re)mounted
 * in different places.
 */
inline static int allow_buffered_open(const char* filename) {
  return filename &&
         !is_blacklisted_filename(filename) && !is_gcrypt_deny_file(filename) &&
         !is_terminal(filename) && !is_proc_mem_file(filename) &&
         !is_proc_fd_dir(filename) && !is_sys_cpu_online_file(filename) &&
         !is_proc_stat_file(filename) && !is_rr_page_lib(filename);
}

#endif /* RR_PRELOAD_INTERFACE_H_ */
